This is a reproducibility report for “Autism spectrum disorder: understanding the impact of SNPs on biological pathways in the fetal and adult cortex” study.
Python (version 3.6.9), R (version 4.0.2) and RStudio (version 1.2.5033) were used for data processing, analysis and visualisation.
First, we run CoDeS3D pipeline to get regulatory interactions across fetal cortical tissue: python codes3d/codes3d.py -s data/asd_snps/344_asd_snps.txt -o results/codes3d/fetal_cortex -n Cortical_plate_neurons_Won2016 Germinal_zone_neurons_Won2016 -t Fetal_Brain_Cortex_Walker2019. Then across adult cortical tissue: python codes3d/codes3d.py -s data/asd_snps/344_asd_snps.txt -o results/codes3d/adult_cortex -n Dorsolateral_prefrontal_cortex_cells_Schmitt2016 -t Brain_Cortex
# get number of ASD-associated GWAS snps present in both fetal and adult eqtl databases
asd_snps <- readLines("data/asd_snps/344_asd_snps.txt") # 344
# read significant interactions in fetal and adult cortical tissues
fetal <- read.table("results/codes3d/fetal_cortex/significant_eqtls.txt", header = TRUE, sep = "\t")
adult <- read.table("results/codes3d/adult_cortex/significant_eqtls.txt", header = TRUE, sep = "\t")
# get eQTLs in fetal and adult cortical tissues
fetal_eqtls <- unique(fetal$snp) # 80
adult_eqtls <- unique(adult$snp) # 58
# get eGenes in fetal and adult cortical tissues
fetal_egenes <- unique(fetal$gene) # 81
adult_egenes <- unique(adult$gene) # 44
snp.df <- data.frame(
eqtl_db = rep(c('Adult', 'Fetal'), each=1),
snps = rep(c("eQTL","non-eQTL"), each=2),
number = c(length(adult_eqtls), length(fetal_eqtls), length(asd_snps)-length(adult_eqtls),
length(asd_snps)-length(fetal_eqtls)),
percentage = c(round(length(adult_eqtls)/length(asd_snps)*100, 2),
round(length(fetal_eqtls)/length(asd_snps)*100, 2),
round((length(asd_snps)-length(adult_eqtls))/length(asd_snps)*100, 2),
round((length(asd_snps)-length(fetal_eqtls))/length(asd_snps)*100, 2)))
#pdf("figures/eQTLs_vs_non-eQTLs.pdf", width = 8, height = 9)
ggplot(snp.df, aes(x = snps, y = percentage, fill = eqtl_db)) +
geom_bar(stat="identity", position = "dodge") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_blank(),
legend.text=element_text(size=20),
legend.title=element_blank(),
legend.position = "bottom",
legend.direction = "horizontal",
axis.text=element_text(size=20, colour = "black"),
axis.title=element_text(size=24, colour = "black"),
strip.text.x = element_text(size = 19, colour = "black")) +
geom_text(aes(y = percentage, label = paste0("n=", number)),
position=position_dodge(width=0.9), vjust=-0.25, color = "black",
size = 8, fontface = 'italic') +
scale_fill_manual(values=c("#92278F" ,"#009444")) +
labs(y = "Percentage")
#dev.off()
We used Fisher’s exact test to check if the proportion of eqtls is significantly different in fetal and adult groups.
# The Fisher’s exact test is used when the total sample size is less than 1000.
eqtl.df <- data.frame(tissue=c('Adult', 'Fetal'),
eqtl=c(length(adult_eqtls), length(fetal_eqtls)),
non_eqtl=c(length(asd_snps)-length(adult_eqtls),
length(asd_snps)-length(fetal_eqtls)))
rownames(eqtl.df) <- eqtl.df[,1]; eqtl.df[,1] <- NULL
f_test <- fisher.test(eqtl.df) # p-value = 0.04531
The proportion of eQTLs is significantly different in two groups with a p-value = 0.04531 (less than the significance level alpha = 0.05).
Thirty ASD-associated eQTLs were shared between fetal and adult cortical tissues.
shared_eqtls <- intersect(fetal_eqtls, adult_eqtls) # 30
shared_noneqtls <- intersect(asd_snps[!(asd_snps %in% fetal_eqtls)],
asd_snps[!(asd_snps %in% adult_eqtls)]) # 236
shared.df <- data.frame(
snps = rep(c("eQTL","non-eQTL")),
number = c(length(shared_eqtls), length(shared_noneqtls)),
percentage = c(round(length(shared_eqtls)/length(asd_snps)*100, 2),
round(length(shared_noneqtls)/length(asd_snps)*100, 2)))
#pdf("figures/shared_eQTL_and_non-eQTL.pdf", width = 5, height = 9)
ggplot(shared.df, aes(x = snps, y = percentage, fill = "#A9A9A9")) +
geom_bar(stat="identity", position = "dodge") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_blank(),
legend.text=element_text(size=20),
legend.title=element_blank(),
legend.position = "none",
axis.text=element_text(size=20, colour = "black"),
axis.title=element_text(size=24, colour = "black"),
strip.text.x = element_text(size = 19, colour = "black")) +
geom_text(aes(y = percentage, label = paste0("n=", number)),
position=position_dodge(width=0.9), vjust=-0.25, color = "black",
size = 8, fontface = 'italic') +
labs(y = "Percentage") +
scale_fill_manual(values=c("#A9A9A9"))
#dev.off()
We used wANNOVAR tool to obtain information about the locus eQTLs tagged. First, we extracted the funnctional annotation for ASD-associated eQTLs: cut -f1,2,3,6,132 results/wannovar/asd_genome_summary.txt | sort -u > results/wannovar/asd_fun_snp_ann.txt
# Calculate percentage of functionally annotated SNPs
cal_ann <- function(phe, total) {
df <- data.frame(matrix(ncol=3, nrow=0))
colnames(df)<- c("annotation","number", "percent")
for (i in 1:length(phe)){
down <- nrow(phe[which(phe$Annotation=="downstream"), ])
df[nrow(df) + 1,] = list(annotation = "downstream", number = down, percent = down/total*100)
ex <- nrow(phe[which(phe$Annotation=="exonic"), ])
df[nrow(df) + 1,] = list(annotation = "exonic", number = ex, percent = ex/total*100)
inter <- nrow(phe[which(phe$Annotation=="intergenic"), ])
df[nrow(df) + 1,] = list(annotation = "intergenic", number = inter, percent = inter/total*100)
intro <- nrow(phe[which(phe$Annotation=="intronic"), ])
df[nrow(df) + 1,] = list(annotation = "intronic", number = intro, percent = intro/total*100)
ncRNA_ex <- nrow(phe[which(phe$Annotation=="ncRNA_exonic"), ])
df[nrow(df) + 1,] = list(annotation = "ncRNA_exonic", number = ncRNA_ex, percent = ncRNA_ex/total*100)
ncRNA_in <- nrow(phe[which(phe$Annotation=="ncRNA_intronic"), ])
df[nrow(df) + 1,] = list(annotation = "ncRNA_intronic", number = ncRNA_in, percent = ncRNA_in/total*100)
up <- nrow(phe[which(phe$Annotation=="upstream"), ])
df[nrow(df) + 1,] = list(annotation = "upstream", number = up, percent = up/total*100)
UTR3 <- nrow(phe[which(phe$Annotation=="UTR3"), ])
df[nrow(df) + 1,] = list(annotation = "UTR3", number = UTR3, percent = UTR3/total*100)
}
df[!duplicated(df), ]
}
asd_ann <- read.table("results/wannovar/asd_fun_snp_ann.txt", sep = "\t", header=TRUE) # wANNOVAR annotation for all ASD-associated snps
asd_ann_fetal <- subset(asd_ann, asd_ann$SNP %in% fetal_eqtls) # 80
asd_ann_adult <- subset(asd_ann, asd_ann$SNP %in% adult_eqtls) # 58
asd_fun_ann_fetal <- cal_ann(asd_ann_fetal, length(fetal_eqtls))
asd_fun_ann_adult <- cal_ann(asd_ann_adult, length(adult_eqtls))
# Fetal eQTLs
f_number <- c(asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="downstream",][,2],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="exonic",][,2],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="intergenic",][,2],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="intronic",][,2],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="ncRNA_intronic",][,2],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="ncRNA_exonic",][,2],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="UTR3",][,2])
f_percent <- c(asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="downstream",][,3],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="exonic",][,3],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="intergenic",][,3],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="intronic",][,3],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="ncRNA_intronic",][,3],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="ncRNA_exonic",][,3],
asd_fun_ann_fetal[asd_fun_ann_fetal$annotation=="UTR3",][,3])
fetal_eqtls.df <- data.frame(
annotation = c("downstream", "exonic", "intergenic", "intronic", "ncRNA_intronic",
"ncRNA_exonic", "UTR3"),
number = f_number,
percent = f_percent)
#pdf("figures/asd_fetal_fun_ann.pdf", width = 6, height = 4)
ggplot(fetal_eqtls.df, aes(x = annotation, y = percent, fill = "#009444")) +
geom_bar(stat="identity", position = "dodge") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
axis.text=element_text(size=14, colour = "black"),
axis.title=element_text(size=16, colour = "black")) +
scale_fill_manual(values = "#009444") +
geom_text(aes(y = percent, label = paste0(round(percent, digits=2), "%")),
position=position_dodge(width=0.9), vjust = 0.25, hjust = -0.25, color = "black",
size = 3.5) +
scale_y_continuous(limits = c(0, 55)) +
ylab("Percentage") +
coord_flip()
#dev.off()
# Adult eQTLs
a_number <- c(asd_fun_ann_adult[asd_fun_ann_adult$annotation=="downstream",][,2],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="exonic",][,2],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="intergenic",][,2],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="intronic",][,2],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="ncRNA_intronic",][,2],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="ncRNA_exonic",][,2],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="UTR3",][,2])
a_percent <- c(asd_fun_ann_adult[asd_fun_ann_adult$annotation=="downstream",][,3],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="exonic",][,3],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="intergenic",][,3],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="intronic",][,3],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="ncRNA_intronic",][,3],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="ncRNA_exonic",][,3],
asd_fun_ann_adult[asd_fun_ann_adult$annotation=="UTR3",][,3])
adult_eqtls.df <- data.frame(
annotation = c("downstream", "exonic", "intergenic", "intronic", "ncRNA_intronic",
"ncRNA_exonic", "UTR3"),
number = a_number,
percent = a_percent)
#pdf("figures/asd_adult_fun_ann.pdf", width = 6, height = 4)
ggplot(adult_eqtls.df, aes(x = annotation, y = percent, fill = "#92278F")) +
geom_bar(stat="identity", position = "dodge") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
axis.text=element_text(size=14, colour = "black"),
axis.title=element_text(size=16, colour = "black")) +
scale_fill_manual(values = "#92278F") +
geom_text(aes(y = percent, label = paste0(round(percent, digits=2), "%")),
position=position_dodge(width=0.9), vjust = 0.25, hjust = -0.25, color = "black",
size = 3.5) +
scale_y_continuous(limits = c(0, 55)) +
ylab("Percentage") +
coord_flip()
#dev.off()
Enrichment of fetal and adult eQTLs within transcription factor binding sites was determined using SNP2TFBS.
snp2tfbs <- read.table("results/snp2tfbs/match_output_20913.txt", header = FALSE)
fetal_snp2tfbs <- snp2tfbs[snp2tfbs$V7 %in% fetal_eqtls, ]
adult_snp2tfbs <- snp2tfbs[snp2tfbs$V7 %in% adult_eqtls, ]
shared_snp2tfbs <- snp2tfbs[snp2tfbs$V7 %in% shared_eqtls, ]
# split the V6 column by ";"
fetal_snp2tfbs <- separate(fetal_snp2tfbs, col=V6, into = c("MATCH", "TF", "ScoreDiff"), sep = ";")
adult_snp2tfbs <- separate(adult_snp2tfbs, col=V6, into = c("MATCH", "TF", "ScoreDiff"), sep = ";")
shared_snp2tfbs <- separate(shared_snp2tfbs, col=V6, into = c("MATCH", "TF", "ScoreDiff"), sep = ";")
# create a dataframe with number of affected TFBSs by fetal, adult or shared eqtls
snp2tfbs.df <- data.frame(
eqtl_db = rep(c('Adult', 'Fetal', 'Shared'), each=5),
tfbs = rep(1:5, 3),
snps = c(nrow(adult_snp2tfbs[adult_snp2tfbs$MATCH=="MATCH=1",]),
nrow(adult_snp2tfbs[adult_snp2tfbs$MATCH=="MATCH=2",]),
nrow(adult_snp2tfbs[adult_snp2tfbs$MATCH=="MATCH=3",]),
nrow(adult_snp2tfbs[adult_snp2tfbs$MATCH=="MATCH=4",]),
nrow(adult_snp2tfbs[adult_snp2tfbs$MATCH=="MATCH=5",]),
nrow(fetal_snp2tfbs[fetal_snp2tfbs$MATCH=="MATCH=1",]),
nrow(fetal_snp2tfbs[fetal_snp2tfbs$MATCH=="MATCH=2",]),
nrow(fetal_snp2tfbs[fetal_snp2tfbs$MATCH=="MATCH=3",]),
nrow(fetal_snp2tfbs[fetal_snp2tfbs$MATCH=="MATCH=4",]),
nrow(fetal_snp2tfbs[fetal_snp2tfbs$MATCH=="MATCH=5",]),
nrow(shared_snp2tfbs[shared_snp2tfbs$MATCH=="MATCH=1",]),
nrow(shared_snp2tfbs[shared_snp2tfbs$MATCH=="MATCH=2",]),
nrow(shared_snp2tfbs[shared_snp2tfbs$MATCH=="MATCH=3",]),
nrow(shared_snp2tfbs[shared_snp2tfbs$MATCH=="MATCH=4",]),
nrow(shared_snp2tfbs[shared_snp2tfbs$MATCH=="MATCH=5",])))
#pdf("figures/snp2tfbs.pdf", width = 10, height = 6)
ggplot(data = snp2tfbs.df, aes(x=tfbs, y=snps, fill=eqtl_db)) +
geom_bar(stat="identity", position = "dodge") +
theme_classic() +
theme(plot.title = element_blank(),
legend.position = "none",
axis.text=element_text(size=20, colour = "black"),
axis.title=element_text(size=24, colour = "black"),
strip.text.x = element_text(size = 18, colour = "black")) +
labs(x = "Number of TFBS affected", y = "Number of SNPs", color=" ") +
scale_fill_manual(values=c("#92278F" ,"#009444", "darkgray")) +
facet_grid(.~eqtl_db)
#dev.off()
Enrichment within active regulatory elements and histone modification marks was identified using the Roadmap Epigenomics Project 15-state ChromHMM models for adult dorsolateral prefrontal cortex (E073_15_coreMarks_hg38lift_mnemonics.bed.gz) and fetal brain (E081_15_coreMarks_hg38lift_mnemonics.bed.gz) downloaded from here.
We used Bedtools to intersect chromHMM models with fetal and adult eQTLs and non-eQTLs:
Intersection between chromHMM adult cortex model and adult cortex eQTLs: bedtools intersect -wb -a data/E073_15_coreMarks_hg38lift_mnemonics.bed -b results/chromHMM/aa-cortex_eqtl_snps_chrrenamed.bed > results/chromHMM/aa-cortex_eqtl_snps_E073.bed Intersection between chromHMM adult cortex model and adult cortex non-eQTLs: bedtools intersect -wb -a data/E073_15_coreMarks_hg38lift_mnemonics.bed -b results/chromHMM/aa-cortex_noneqtl_snps_chrrenamed.bed > results/chromHMM/aa-cortex_noneqtl_snps_E073.bed Intersection between chromHMM fetal cortex model and fetal cortex eQTLs: bedtools intersect -wb -a data/E081_15_coreMarks_hg38lift_mnemonics.bed -b results/chromHMM/ff-cortex_eqtl_snps_chrrenamed.bed > results/chromHMM/ff-cortex_eqtl_snps_E081.bed Intersection between chromHMM fetal cortex model and fetal cortex non-eQTLs: bedtools intersect -wb -a data/E081_15_coreMarks_hg38lift_mnemonics.bed -b results/chromHMM/ff-cortex_noneqtl_snps_chrrenamed.bed > results/chromHMM/ff-cortex_noneqtl_snps_E081.bed
# loading ChromHMM files
f_chromhmm <- read.table(gzfile("data/E081_15_coreMarks_hg38lift_mnemonics.bed.gz"))
a_chromhmm <- read.table(gzfile("data/E073_15_coreMarks_hg38lift_mnemonics.bed.gz"))
# loading fetal and adult snps
f_eqtls <- read.table("results/chromHMM/ff-cortex_eqtl_snps_E081.bed")
f_eqtls <- distinct(f_eqtls[, c("V4", "V8")]) # 80
f_noneqtls <- read.table("results/chromHMM/ff-cortex_noneqtl_snps_E081.bed")
f_noneqtls <- distinct(f_noneqtls[, c("V4", "V8")]) # 264
a_eqtls <- read.table("results/chromHMM/aa-cortex_eqtl_snps_E073.bed")
a_eqtls <- distinct(a_eqtls[, c("V4", "V8")]) # 58
a_noneqtls <- read.table("results/chromHMM/aa-cortex_noneqtl_snps_E073.bed")
a_noneqtls <- distinct(a_noneqtls[, c("V4", "V8")]) # 286
x_order <- c('Quies', 'TxWk', 'ReprPCWk', 'ReprPC', 'Enh', 'Tx', 'Het', 'TssA', 'TssAFlnk')
chromHMM.df <- data.frame(
chromhmm_state = rep(x_order, each=2),
eqtl_db = rep(c('Adult', 'Fetal'), 9),
snps = c(nrow(a_eqtls[a_eqtls$V4=='15_Quies',]), nrow(f_eqtls[f_eqtls$V4=='15_Quies',]),
nrow(a_eqtls[a_eqtls$V4=='5_TxWk',]), nrow(f_eqtls[f_eqtls$V4=='5_TxWk',]),
nrow(a_eqtls[a_eqtls$V4=='14_ReprPCWk',]), nrow(f_eqtls[f_eqtls$V4=='14_ReprPCWk',]),
nrow(a_eqtls[a_eqtls$V4=='13_ReprPC',]), nrow(f_eqtls[f_eqtls$V4=='13_ReprPC',]),
nrow(a_eqtls[a_eqtls$V4=='7_Enh',]), nrow(f_eqtls[f_eqtls$V4=='7_Enh',]),
nrow(a_eqtls[a_eqtls$V4=='4_Tx',]), nrow(f_eqtls[f_eqtls$V4=='4_Tx',]),
nrow(a_eqtls[a_eqtls$V4=='9_Het',]), nrow(f_eqtls[f_eqtls$V4=='9_Het',]),
nrow(a_eqtls[a_eqtls$V4=='1_TssA',]), nrow(f_eqtls[f_eqtls$V4=='1_TssA',]),
nrow(a_eqtls[a_eqtls$V4=='2_TssAFlnk',]), nrow(f_eqtls[f_eqtls$V4=='2_TssAFlnk',])),
percent = c(round(nrow(a_eqtls[a_eqtls$V4=='15_Quies',])/nrow(a_eqtls)*100, 2),
round(nrow(f_eqtls[f_eqtls$V4=='15_Quies',])/nrow(f_eqtls)*100, 2),
round(nrow(a_eqtls[a_eqtls$V4=='5_TxWk',])/nrow(a_eqtls)*100, 2),
round(nrow(f_eqtls[f_eqtls$V4=='5_TxWk',])/nrow(f_eqtls)*100, 2),
round(nrow(a_eqtls[a_eqtls$V4=='14_ReprPCWk',])/nrow(a_eqtls)*100, 2),
round(nrow(f_eqtls[f_eqtls$V4=='14_ReprPCWk',])/nrow(f_eqtls)*100, 2),
round(nrow(a_eqtls[a_eqtls$V4=='13_ReprPC',])/nrow(a_eqtls)*100, 2),
round(nrow(f_eqtls[f_eqtls$V4=='13_ReprPC',])/nrow(f_eqtls)*100, 2),
round(nrow(a_eqtls[a_eqtls$V4=='7_Enh',])/nrow(a_eqtls)*100, 2),
round(nrow(f_eqtls[f_eqtls$V4=='7_Enh',])/nrow(f_eqtls)*100, 2),
round(nrow(a_eqtls[a_eqtls$V4=='4_Tx',])/nrow(a_eqtls)*100, 2),
round(nrow(f_eqtls[f_eqtls$V4=='4_Tx',])/nrow(f_eqtls)*100, 2),
round(nrow(a_eqtls[a_eqtls$V4=='9_Het',])/nrow(a_eqtls)*100, 2),
round(nrow(f_eqtls[f_eqtls$V4=='9_Het',])/nrow(f_eqtls)*100, 2),
round(nrow(a_eqtls[a_eqtls$V4=='1_TssA',])/nrow(a_eqtls)*100, 2),
round(nrow(f_eqtls[f_eqtls$V4=='1_TssA',])/nrow(f_eqtls)*100, 2),
round(nrow(a_eqtls[a_eqtls$V4=='2_TssAFlnk',])/nrow(a_eqtls)*100, 2),
round(nrow(f_eqtls[f_eqtls$V4=='2_TssAFlnk',])/nrow(f_eqtls)*100, 2)))
#pdf("figures/chromHMM.pdf", width = 9, height = 6)
ggplot(chromHMM.df, aes(x = factor(chromhmm_state, level=x_order), y = percent, fill = eqtl_db)) +
geom_bar(stat="identity", position = "dodge") +
theme_classic() +
theme(plot.title = element_blank(),
legend.text=element_text(size=16),
legend.title=element_blank(),
legend.position = "bottom",
legend.direction = "horizontal",
axis.text=element_text(size=16, colour = "black"),
axis.text.x=element_text(angle = 20, vjust = 0.5, hjust=0.4),
axis.title=element_text(size=21, colour = "black")) +
labs(x = "ChromHMM states", y = "Percentage of eQTL SNPs") +
geom_text(aes(y = percent, label = paste0("n=", snps)),
position=position_dodge(width=0.9), vjust=-0.25, color = "black",
size = 4, fontface = 'italic') +
scale_fill_manual(values=c("#92278F", "#009444"))
#dev.off()
The majority of the fetal ASD-associated eQTLs located within weakly repressed PolyComb (ReprPCWk) and repressed PolyComb (ReprPC) regions were not identified as being eQTLs within the adult cortex.
f_ReprPC <- f_eqtls[f_eqtls$V4=='13_ReprPC', ] # 10 fetal ReprPC eQTLs
#a_eqtls[a_eqtls$V8 %in% f_ReprPC$V8, ] # 1 eQTL-Quies
#a_noneqtls[a_noneqtls$V8 %in% f_ReprPC$V8, ] # 4 noneQTL-ReprPCWk, 3 noneQTL-Quies, 1 noneQTL-ReprPC and 1 noneQTL-Enh
x_order <- c('eQTL-Quies', 'noneQTL-ReprPCWk', 'noneQTL-Quies', 'noneQTL-ReprPC', 'noneQTL-Enh')
reprPC.df <- data.frame(
a_chromhmm_state = x_order,
snps = c(1, 4, 3, 1, 1))
#pdf("figures/reprPC_chromHMM.pdf", width = 9, height = 6)
ggplot(reprPC.df, aes(x = factor(a_chromhmm_state, level=x_order), y = snps)) +
geom_bar(stat="identity", position = "dodge", fill = "#92278F") +
theme_classic() +
theme(plot.title = element_blank(),
legend.text=element_text(size=16),
legend.title=element_blank(),
legend.position = "bottom",
legend.direction = "horizontal",
axis.text=element_text(size=16, colour = "black"),
axis.text.x=element_text(angle = 20, vjust = 0.5, hjust=0.4),
axis.title=element_text(size=21, colour = "black")) +
labs(x = "ChromHMM states in adult cortex", y = "Number of ReprPC eQTL SNPs\n in fetal cortex") +
geom_text(aes(y = snps, label = paste0("n=", snps)),
position=position_dodge(width=0.9), vjust=-0.25, color = "black",
size = 7, fontface = 'italic')
#dev.off()
f_ReprPCWk <- f_eqtls[f_eqtls$V4=='14_ReprPCWk',]
#a_eqtls[a_eqtls$V8 %in% f_ReprPCWk$V8, ] # 4 eQTL-Quies
#a_noneqtls[a_noneqtls$V8 %in% f_ReprPCWk$V8, ] # 9 noneQTL-Quies and 1 noneQTL-ReprPCWk
x_order <- c('eQTL-Quies', 'noneQTL-Quies', 'noneQTL-ReprPCWk')
reprPCWk.df <- data.frame(
a_chromhmm_state = x_order,
snps = c(4, 9, 1))
#options(digits=0)
#pdf("figures/reprPCWk_chromHMM.pdf", width = 9, height = 6)
ggplot(reprPCWk.df, aes(x = factor(a_chromhmm_state, level=x_order), y = snps)) +
geom_bar(stat="identity", position = "dodge", fill = "#92278F") +
theme_classic() +
theme(plot.title = element_blank(),
legend.text=element_text(size=16),
legend.title=element_blank(),
legend.position = "bottom",
legend.direction = "horizontal",
axis.text=element_text(size=16, colour = "black"),
axis.text.x=element_text(angle = 20, vjust = 0.5, hjust=0.4),
axis.title=element_text(size=21, colour = "black")) +
labs(x = "ChromHMM states in adult cortex",
y = "Number of ReprPCWk eQTL SNPs\n in fetal cortex") +
geom_text(aes(y = snps, label = paste0("n=", snps)),
position=position_dodge(width=0.9), vjust=-0.25, color = "black",
size = 7, fontface = 'italic') +
scale_y_continuous(limits=c(0, 10))
#dev.off()
ChromHMM enrichment analysis was performed using R package regioneR (permutation test: 1000).
# loading ChromHMM files
f <- gzfile("data/E081_15_coreMarks_hg38lift_mnemonics.bed.gz")
f_chromhmm <- import(f, format = "BED")
a <- gzfile("data/E073_15_coreMarks_hg38lift_mnemonics.bed.gz")
a_chromhmm <- import(a, format = "BED")
f <- read.table("results/chromHMM/ff-cortex_eqtl_snps_E081.bed")
f <- distinct(f[, c("V1", "V2", "V3", "V8")])
f_eqtls <- makeGRangesFromDataFrame(f, seqnames.field = "V1", start.field = "V2", end.field = "V3",
ignore.strand = TRUE, keep.extra.columns = TRUE)
numOverlaps(f_eqtls, f_chromhmm, genome="hg38", count.once=TRUE) # 80
## [1] 80
# permute the ChromHMM regions
set.seed(1234)
numOverlaps(f_eqtls, randomizeRegions(f_chromhmm), genome="hg38", count.once=TRUE) # 43
## [1] 43
# test if eQTLs overlap with ChromHMM more than expected; 1000 permutations
f_pt <- overlapPermTest(A=f_eqtls, B=f_chromhmm, ntimes=1000, genome="hg38",
mc.set.seed=FALSE, force.parallel=TRUE, count.once=TRUE)
f_pt # P-value: 0.000999000999000999. The test was significant, with a p-value < 0.001.
## $numOverlaps
## P-value: 0.000999000999000999
## Z-score: 3.3602
## Number of iterations: 1000
## Alternative: greater
## Evaluation of the original region set: 80
## Evaluation function: numOverlaps
## Randomization function: randomizeRegions
##
## attr(,"class")
## [1] "permTestResultsList"
#plot(f_pt)
# test if eQTLs overlap with ChromHMM ReprPCWk regions more than expected; 1000 permutations
f_ReprPCWk_reg <- f_chromhmm[f_chromhmm@elementMetadata@listData[["name"]]=="14_ReprPCWk"]
set.seed(1234)
f_pt <- overlapPermTest(A=f_eqtls, B=f_ReprPCWk_reg, ntimes=1000, genome="hg38",
mc.set.seed=FALSE, force.parallel=TRUE, count.once=TRUE)
f_pt # P-value: 0.000999000999000999 The test was significant, with a p-value < 0.001.
## $numOverlaps
## P-value: 0.000999000999000999
## Z-score: 3.1126
## Number of iterations: 1000
## Alternative: greater
## Evaluation of the original region set: 14
## Evaluation function: numOverlaps
## Randomization function: randomizeRegions
##
## attr(,"class")
## [1] "permTestResultsList"
#plot(f_pt)
# test if eQTLs overlap with ChromHMM ReprPC regions more than expected; 1000 permutations
f_ReprPC_reg <- f_chromhmm[f_chromhmm@elementMetadata@listData[["name"]]=="13_ReprPC"]
set.seed(1234)
f_pt <- overlapPermTest(A=f_eqtls, B=f_ReprPC_reg, ntimes=1000, genome="hg38",
mc.set.seed=FALSE, force.parallel=TRUE, count.once=TRUE)
f_pt # P-value: 0.000999000999000999 The test was significant, with a p-value < 0.001.
## $numOverlaps
## P-value: 0.000999000999000999
## Z-score: 7.5306
## Number of iterations: 1000
## Alternative: greater
## Evaluation of the original region set: 10
## Evaluation function: numOverlaps
## Randomization function: randomizeRegions
##
## attr(,"class")
## [1] "permTestResultsList"
#plot(f_pt)
a <- read.table("results/chromHMM/aa-cortex_eqtl_snps_E073.bed")
a <- distinct(a[, c("V1", "V2", "V3", "V8")])
a_eqtls <- makeGRangesFromDataFrame(a, seqnames.field = "V1", start.field = "V2", end.field = "V3",
ignore.strand = TRUE, keep.extra.columns = TRUE)
numOverlaps(a_eqtls, a_chromhmm, count.once=TRUE) # should be 58
## [1] 58
# permute the ChromHMM regions
set.seed(1234)
numOverlaps(a_eqtls, randomizeRegions(a_chromhmm), genome="hg38", count.once=TRUE) # 43
## [1] 30
# test if eQTLs overlap with ChromHMM more than expected; 1000 permutations
a_pt <- overlapPermTest(A=a_eqtls, B=a_chromhmm, ntimes=1000, genome="hg38",
mc.set.seed=FALSE, force.parallel=TRUE, count.once=TRUE)
a_pt # P-value: 0.002997002997003. The test was significant, with a p-value < 0.005.
## $numOverlaps
## P-value: 0.002997002997003
## Z-score: 2.7766
## Number of iterations: 1000
## Alternative: greater
## Evaluation of the original region set: 58
## Evaluation function: numOverlaps
## Randomization function: randomizeRegions
##
## attr(,"class")
## [1] "permTestResultsList"
#plot(a_pt)
Finally, we evaluated identified eQTL associations with other phenotypes in the GWAS Catalog.
# loading gwas associations
gwas <- gzfile("data/gwas_catalog_v1.0.2-associations_e100_r2020-08-26.tsv.gz",'rt')
gwas_assoc <- read.delim(gwas, header = TRUE, quote= "") # 197,439 associations
# subsetting
#gwas_assoc_sub <- subset(gwas_assoc, select=c(CHR_ID, SNPS, CONTEXT, P.VALUE, MAPPED_TRAIT))
gwas_assoc_sub <- subset(gwas_assoc, select=c(SNPS, P.VALUE, MAPPED_TRAIT))
colnames(gwas_assoc_sub) <- c("snp", "p_value", "mapped_trait")
gwas_assoc_sub <- subset(gwas_assoc_sub, p_value < 0.00000005)
rs <- grep("^rs", gwas_assoc_sub$snp) # removing SNPs that don't have rs IDs
gwas_assoc_sub <- subset(gwas_assoc_sub, snp %in% snp[rs])
x <- grep(" x ", gwas_assoc_sub$snp) # removing SNPs that have "x" in IDs
gwas_assoc_sub <- subset(gwas_assoc_sub, !(snp %in% snp[x]))
y <- grep("\\-", gwas_assoc_sub$snp) # removing SNPs that have "-" in IDs
gwas_assoc_sub <- subset(gwas_assoc_sub, !(snp %in% snp[y]))
y <- grep("\\_", gwas_assoc_sub$snp) # removing SNPs that have "_" in IDs
gwas_assoc_sub <- subset(gwas_assoc_sub, !(snp %in% snp[y]))
y <- grep("\\*", gwas_assoc_sub$snp) # removing SNPs that have "*" in IDs
gwas_assoc_sub <- subset(gwas_assoc_sub, !(snp %in% snp[y]))
y <- grep("\\/", gwas_assoc_sub$snp) # removing SNPs that have "/" in IDs
gwas_assoc_sub <- subset(gwas_assoc_sub, !(snp %in% snp[y]))
y <- c("rs12524487 ", "rs2428362†")
gwas_assoc_sub <- subset(gwas_assoc_sub, !(snp %in% snp[y])) # removing wierdos
gwas_assoc_uniq <- distinct(gwas_assoc_sub) # getting only unique associations
#write.table(gwas_assoc_uniq, file = "results/gwas/gwas_5E-08.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
We used python scripts/fix_snps.py to split the lines with snps separated by “;”. We run python scripts/fix_traits.py to split the mapped traits by “,” to preserve single snp - single trait relationship. Next, we run python scripts/fix_ids.py to remap merged old rsIDs into a new rsIDs. The RsMergeArch.bcp.gz file with rs merge table for genome GRCh38p7 build 151 was downloaded from the dbSNP ftp site on 27/08/2020.
gwas_st_pairs <- read.table("results/gwas/gwas_5E-08_snpsfixed_traitfixed_idfixed.txt", sep = "\t", header = TRUE, quote= "") # 149,096 gwas snp-trait pairs
gwas_st_pairs_uniq <- subset(gwas_st_pairs, select=c(snp.1, mapped_trait)) %>% distinct() # 111,172 unique snp-trait pairs
#writeLines(gwas_st_pairs_uniq$snp.1, con = "results/gwas/gwas_5E-08_snps.txt", sep = "\n")
To get genomic positions for GWAS SNPs according to genome GRCh38p7 build 151, we downloaded dbSNP bed files for each chromosome from dbSNP ftp site, concatenated them and run bash scripts/rsID2Bed.sh results/gwas/gwas_5E-08_snps.txt
colnames(gwas_st_pairs_uniq) <- c("snp", "mapped_trait")
gwas_st_pairs_uniq_pos <- read.table("results/gwas/rsID2Bed/gwas_5E-08_snps.txt.bed", sep = "\t", header = FALSE, quote= "")
colnames(gwas_st_pairs_uniq_pos) <- c("chr", "start", "end", "snp", "score", "strand")
# merge two tables by snp rsID
gwas_st_pairs_uniq_pos <- distinct(gwas_st_pairs_uniq_pos)
merged <- merge(gwas_st_pairs_uniq, gwas_st_pairs_uniq_pos, by = "snp")
We overlapped GWAS SNPs with fetal and adult eQTLs to identify how many eQTLs have an association with GWAS traits.
f_assoc <- merged[merged$snp %in% fetal_eqtls, ]
a_assoc <- merged[merged$snp %in% adult_eqtls, ]
f_melt <- melt(f_assoc, id="snp", measure.vars = "mapped_trait")
f_traits <- dcast(f_melt, value ~ variable) # 37 GWAS traits
a_melt <- melt(a_assoc, id="snp", measure.vars = "mapped_trait")
a_traits <- dcast(a_melt, value ~ variable) # 28 GWAS traits
# Adult eQTLs
a_order <- c("autism spectrum disorder ", "schizophrenia ", "unipolar depression ", "attention deficit hyperactivity disorder ", "bipolar disorder ", "anorexia nervosa ", "obsessive-compulsive disorder ", "Tourette syndrome ", "intelligence ", "self reported educational attainment ")
adult_eqtls.df <- data.frame(
gwas = a_order,
number = c(58, 57, 26, 24, 24, 21, 21, 21, 3, 3))
#pdf("figures/adult_gwas_assoc_top10.pdf", width = 10, height = 7)
ggplot(adult_eqtls.df, aes(x = factor(gwas, level=rev(a_order)), y = number, fill = "#92278F")) +
geom_bar(stat="identity", position = "dodge") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
axis.text=element_text(size=14, colour = "black"),
axis.title=element_text(size=16, colour = "black")) +
scale_fill_manual(values = "#92278F") +
geom_text(aes(y = number, label = paste0("n=", number)), position=position_dodge(width=0.9),
vjust = 0.25, hjust = -0.25, color = "black", size = 5) +
scale_y_continuous(limits = c(0, 60)) +
ylab("Number of eQTL SNPs") +
coord_flip()
#dev.off()
# Fetal eQTLs
f_order <- c("autism spectrum disorder ", "schizophrenia ", "unipolar depression ", "attention deficit hyperactivity disorder ", "bipolar disorder ", "anorexia nervosa ", "obsessive-compulsive disorder ", "Tourette syndrome ", "intelligence ", "self reported educational attainment ")
fetal_eqtls.df <- data.frame(
gwas = f_order,
number = c(79, 78, 32, 30, 30, 24, 24, 24, 5, 4))
#pdf("figures/fetal_gwas_assoc_top10.pdf", width = 10, height = 7)
ggplot(fetal_eqtls.df, aes(x = factor(gwas, level=rev(f_order)), y = number, fill = "#009444")) +
geom_bar(stat="identity", position = "dodge") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
axis.text=element_text(size=14, colour = "black"),
axis.title=element_text(size=16, colour = "black")) +
scale_fill_manual(values = "#009444") +
geom_text(aes(y = number, label = paste0("n=", number)), position=position_dodge(width=0.9),
vjust = 0.25, hjust = -0.25, color = "black", size = 5) +
scale_y_continuous(limits = c(0, 85)) +
ylab("Number of eQTL SNPs") +
coord_flip()
#dev.off()
Bootstrapping analysis (n=10,000 iterations) was performed to test if observed overlaps were non-random. These overlaps were statistically significant (bootstrapping, p < 0.01, n=10,000).
# Bootrstrapping function accepts the two vectors of phenotype-specific snps, the reference set of snps (e.g. all GWAS snps), the actual observed overlap, and the number of bootstraps to perform (usually 10,000). It returns a bootstrapping p value for the overlap.
snps_bootstrapping <- function(input_1, input_2, reference_snps, act_overlap, num) {
# resample snps from the reference set
resampled_1 <- lapply(1:num, function(i) sample(reference_snps, length(input_1),
replace=TRUE))
resampled_2 <- lapply(1:num, function(i) sample(reference_snps, length(input_2),
replace=TRUE))
# get number of overlaps between new sets of snps
overlap_num <- mapply(function(x, y) {
intersect(x, y)
}, x=resampled_1, y=resampled_2)
# count instances that have overlaps more or equal to actual overlap
actual_overlaps <- c()
for (i in 1:length(overlap_num)){
if (length(overlap_num[[i]]) >= act_overlap){
actual_overlaps <- append(actual_overlaps, length(overlap_num[[i]]))
}
}
# calculate bootstrapping p value
p_value <- sum(actual_overlaps)/num
p_value
}
# Testing
#input_1 <- LETTERS[1:5]
#input_2 <- LETTERS[1:2]; input_2 <- append(input_2,rev(LETTERS)[1:3])
#reference_snps <- LETTERS
#act_overlap <- 1
#num <- 10
#snps_bootstrapping(input_1, input_2, reference_snps, act_overlap, num)
# Bootstrapping analysis between ASD and SCZ snps
fetal_eqtls <- unique(fetal$snp) # 80
adult_eqtls <- unique(adult$snp) # 58
gwas_snps <- unique(merged$snp) # 78166
scz <- merged[merged$mapped_trait == "schizophrenia",] # 1224
scz_snps <- unique(scz$snp) # should be 1224
options(digits=3)
set.seed(1234)
# ASD and SCZ (78 snps in fetal and 57 snp sin adult cortical tissues)
snps_bootstrapping(fetal_eqtls, scz_snps, gwas_snps, 78, 10000) # should be 0
## [1] 0
snps_bootstrapping(adult_eqtls, scz_snps, gwas_snps, 57, 10000) # should be 0
## [1] 0
To identify existing and novel gene associations, we intersected our lists of genes, from fetal and adult cortical tissues, with a curated list of 1,237 genes that had been previously implicated in autism development (AutDB, accessed on 16/11/2020).
# loading AutDB genes
AutDB <- read.csv("data/AutDB_gene-report.csv")
AutDB_genes <- unique(AutDB$Gene.Symbol) # 1237 AutDB genes
# intersecting AutDB genes with genes regulated in fetal and adult cortex
fetal_egenes <- unique(fetal$gene) # 81
adult_egenes <- unique(adult$gene) # 44
f_autdb <- intersect(AutDB_genes, fetal_egenes) # 8
a_autdb <- intersect(AutDB_genes, adult_egenes) # 7
linked_to_asd <- unique(c(f_autdb, a_autdb)) # 11
We identified 11 genes (8 in fetal and 7 in adult cortical tissues) that had been previously linked to ASD. Bootstrapping analysis revealed that these overlaps are significant (p < 0.01, n=10,000).
# Bootrstrapping function accepts the number of egenes-associated with ASD, the reference set 1 of egenes (e.g. all egenes), the reference set 2 (AutDB genes), the actual observed overlap, and the number of bootstraps to perform (usually 10,000). It returns a bootstrapping p value for the overlap.
egenes_bootstrapping <- function(in1, ref_1, ref_2, act_overlap, num) {
# resample snps from the reference set
resampled_1 <- lapply(1:num, function(i) sample(ref_1, in1,
replace=TRUE))
# make 10000 bootstraps of AutDB
resampled_2 <- rep(list(ref_2), num)
# get number of overlaps between new sets of snps
overlap_num <- mapply(function(x, y) {
intersect(x, y)
}, x=resampled_1, y=resampled_2)
# count instances that have overlaps more or equal to actual overlap
actual_overlaps <- c()
for (i in 1:length(overlap_num)){
if (length(overlap_num[[i]]) >= act_overlap){
actual_overlaps <- append(actual_overlaps, length(overlap_num[[i]]))
}
}
# calculate bootstrapping p value
p_value <- sum(actual_overlaps)/num
p_value
}
# Testing
#input_1 <- LETTERS[1:5]; reference1 <- LETTERS; reference2 <- rev(LETTERS)[1:3]
#act_overlap <- 1; num <- 10
#egenes_bootstrapping(5, reference1, reference2, act_overlap, num)
# Bootstrapping analysis between AutDB and ASD genes
all_genes.df <- read.table("data/adult_brain.gene_tpm_median.txt") # the same with fetal
all_genes <- unique(all_genes.df$V2) # should be 54592
options(digits=3)
set.seed(1234)
# Fetal genes and AutDB
egenes_bootstrapping(length(fetal_egenes), all_genes, AutDB_genes, length(f_autdb), 10000)
## [1] 0.0042
# Adult genes and AutDB
egenes_bootstrapping(length(adult_egenes), all_genes, AutDB_genes, length(a_autdb), 10000)
## [1] 0
# read significant interactions in fetal and adult cortical tissues
fetal <- read.table("results/codes3d/fetal_cortex/significant_eqtls.txt", header = TRUE, sep = "\t")
adult <- read.table("results/codes3d/adult_cortex/significant_eqtls.txt", header = TRUE, sep = "\t")
# read gnomad file
gnomad <- read.table(gzfile('data/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz','rt'), header = TRUE,
sep="\t")
gene_list <- fetal[, c("gene", "interaction_type", "snp", "log2_aFC")]
gnomad <- gnomad[, c("gene", "pLI")]
gnomad_pheno <- gnomad %>%
mutate(intolerant=ifelse(pLI>=0.9, "Intolerant", "Tolerant")) %>%
mutate(intolerant=ifelse(is.na(intolerant),"nopli",intolerant)) %>%
full_join(gene_list) %>%
mutate(interaction_type=ifelse(is.na(interaction_type),"non-eQTL",interaction_type)) %>%
mutate(interaction_type=factor(interaction_type,
levels=c("non-eQTL","Cis","Trans-intrachromosomal",
"Trans-interchromosomal")))
#write.table(gnomad_pheno, file = "results/LoF/fetal_LoF.txt", sep = "\t", col.names = TRUE,
# row.names=FALSE)
gene_list <- adult[, c("gene", "interaction_type", "snp", "log2_aFC")]
gnomad <- gnomad[, c("gene", "pLI")]
gnomad_pheno <- gnomad %>%
mutate(intolerant=ifelse(pLI>=0.9, "Intolerant", "Tolerant")) %>%
mutate(intolerant=ifelse(is.na(intolerant),"nopli",intolerant)) %>%
full_join(gene_list) %>%
mutate(interaction_type=ifelse(is.na(interaction_type),"non-eQTL",interaction_type)) %>%
mutate(interaction_type=factor(interaction_type,
levels=c("non-eQTL","Cis","Trans-intrachromosomal",
"Trans-interchromosomal")))
#write.table(gnomad_pheno, file = "results/LoF/adult_LoF.txt", sep = "\t", col.names = TRUE,
# row.names=FALSE)
GO analysis was performed using the g:GOSt module of the g:Profiler tool. The significance level was determined using Benjamini-Hochberg algorithm (FDR < 0.05).
# This function quieries g:GOSt module of the g:Profiler tool. It takes a vector of genes and quieries the GOSt module. It outputs the dataframe with the query results for the genes.
query_go <- function(genes){
tryCatch({
t <- gost(query = genes, organism = "hsapiens", ordered_query = TRUE,
multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
measure_underrepresentation = FALSE, evcodes = TRUE,
user_threshold = 0.05, correction_method = "fdr",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = "GO", as_short_link = FALSE)
return(t[["result"]])
}, error=function(e){
cat("ERROR: ", conditionMessage(e), "\n")
})
}
#a_go <- query_go(adult_egenes); a_gos.df <- apply(a_go, 2, as.character)
#write.table(a_gos.df, file = "results/go/a_all_gos_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
a_gos.df <- read.table("results/go/a_all_gos_fdr.txt", header = TRUE, sep="\t")
a_bp <- a_gos.df[grep('GO:BP', a_gos.df[, "source"]), ] # extracting only the 'GO:BP' terms
a_bp_top10 <- as.data.frame(a_bp[1:10,])
#pdf("figures/a_bp_top10.pdf", width = 11, height = 7)
ggplot(a_bp_top10, aes(x=factor(term_name, levels = rev(levels(factor(term_name)))),
y=-log10(as.numeric(p_value)), fill="#92278F")) +
geom_bar(stat="identity") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_text(size=16, colour = "black"),
axis.text=element_text(size=16, colour = "black"),
axis.title.y = element_blank(),
legend.position = "none") +
scale_fill_manual(values=c("#92278F")) +
labs(y = "-log10(p)") +
geom_hline(aes(yintercept=-log10(as.numeric(0.05))), colour = "red", size = 1) +
coord_flip()
#dev.off()
a_mf <- a_gos.df[grep('GO:MF', a_gos.df[, "source"]), ] # extracting only the 'GO:MF' terms
a_mf_top10 <- as.data.frame(a_mf[1:10,])
#pdf("figures/a_mf_top10.pdf", width = 9, height = 7)
ggplot(a_mf_top10, aes(x=factor(term_name, levels = rev(levels(factor(term_name)))),
y=-log10(as.numeric(p_value)), fill="#92278F")) +
geom_bar(stat="identity") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_text(size=16, colour = "black"),
axis.text=element_text(size=16, colour = "black"),
axis.title.y = element_blank(),
legend.position = "none") +
scale_fill_manual(values=c("#92278F")) +
labs(y = "-log10(p)") +
geom_hline(aes(yintercept=-log10(as.numeric(0.05))), colour = "red", size = 1) +
coord_flip()
#dev.off()
a_cc <- a_gos.df[grep('GO:CC', a_gos.df[, "source"]), ] # extracting only the 'GO:CC' terms
a_cc_top10 <- as.data.frame(a_cc[1:10,])
#pdf("figures/a_cc_top10.pdf", width = 8, height = 7)
ggplot(a_cc_top10, aes(x=factor(term_name, levels = rev(levels(factor(term_name)))),
y=-log10(as.numeric(p_value)), fill="#92278F")) +
geom_bar(stat="identity") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_text(size=16, colour = "black"),
axis.text=element_text(size=16, colour = "black"),
axis.title.y = element_blank(),
legend.position = "none") +
scale_fill_manual(values=c("#92278F")) +
labs(y = "-log10(p)") +
geom_hline(aes(yintercept=-log10(as.numeric(0.05))), colour = "red", size = 1) +
coord_flip()
#dev.off()
#f_go <- query_go(fetal_egenes); f_gos.df <- apply(f_go, 2, as.character)
#write.table(f_gos.df, file = "results/go/f_all_gos_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
f_gos.df <- read.table("results/go/f_all_gos_fdr.txt", header = TRUE, sep="\t")
f_bp <- f_gos.df[grep('GO:BP', f_gos.df[, "source"]), ]
f_bp_top10 <- as.data.frame(f_bp[1:10,])
#pdf("figures/f_bp_top10.pdf", width = 9, height = 7)
ggplot(f_bp_top10, aes(x=factor(term_name, levels = rev(levels(factor(term_name)))),
y=-log10(as.numeric(p_value)), fill="#009444")) +
geom_bar(stat="identity") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_text(size=16, colour = "black"),
axis.text=element_text(size=16, colour = "black"),
axis.title.y = element_blank(),
legend.position = "none") +
scale_fill_manual(values=c("#009444")) +
labs(y = "-log10(p)") +
geom_hline(aes(yintercept=-log10(as.numeric(0.05))), colour = "red", size = 1) +
coord_flip()
#dev.off()
f_mf <- f_gos.df[grep('GO:MF', f_gos.df[, "source"]), ]
f_mf_top10 <- as.data.frame(f_mf[1:10,])
x_factor = sort(f_mf_top10[, "term_name"])
x_factor = c("acyl-CoA desaturase activity", "arsenite methyltransferase activity",
"linoleoyl-CoA desaturase activity", "methylarsonite methyltransferase activity",
"MHC class II protein complex binding", "MHC protein complex binding",
"oxidoreductase activity",
"peptide antigen binding", "TAP binding", "TAP1 binding")
# We replaced NA with "oxidoreductase activity..."
#pdf("figures/f_mf_top10.pdf", width = 6, height = 7)
ggplot(f_mf_top10, aes(x=factor(term_name, levels = rev(x_factor)),
y=-log10(as.numeric(p_value)), fill="#009444")) +
geom_bar(stat="identity") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_text(size=16, colour = "black"),
axis.text=element_text(size=16, colour = "black"),
axis.title.y = element_blank(),
legend.position = "none") +
scale_fill_manual(values=c("#009444")) +
labs(y = "-log10(p)") +
geom_hline(aes(yintercept=-log10(as.numeric(0.05))), colour = "red", size = 1) +
coord_flip()
#dev.off()
f_cc <- f_gos.df[grep('GO:CC', f_gos.df[, "source"]), ]
f_cc_top10 <- as.data.frame(f_cc[1:10,])
#pdf("figures/f_cc_top10.pdf", width = 9, height = 7)
ggplot(f_cc_top10, aes(x=factor(term_name, levels = rev(levels(factor(term_name)))),
y=-log10(as.numeric(p_value)), fill="#009444")) +
geom_bar(stat="identity") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_text(size=16, colour = "black"),
axis.text=element_text(size=16, colour = "black"),
axis.title.y = element_blank(),
legend.position = "none") +
scale_fill_manual(values=c("#009444")) +
labs(y = "-log10(p)") +
geom_hline(aes(yintercept=-log10(as.numeric(0.05))), colour = "red", size = 1) +
coord_flip()
#dev.off()
Biological processes for fetal and adult cortex-specific genes associated with the ASD-associated eQTLs, after removing all HLA genes from the lists.
#a_egenes_nohla <- readLines("results/go/adult_egenes_nohla.txt")
#a_go <- query_go(a_egenes_nohla); a_gos.df <- apply(a_go, 2, as.character)
#write.table(a_gos.df, file = "results/go/a_nohla_all_gos_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
a_gos.df <- read.table("results/go/a_nohla_all_gos_fdr.txt", header = TRUE, sep="\t")
a_bp <- a_gos.df[grep('GO:BP', a_gos.df[, "source"]), ]
#write.table(a_bp, file = "results/go/a_bp_nohla_all_gos_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
#f_egenes_nohla <- readLines("results/go/fetal_egenes_nohla.txt")
#f_go <- query_go(f_egenes_nohla); f_gos.df <- apply(f_go, 2, as.character)
#write.table(f_gos.df, file = "results/go/f_nohla_all_gos_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
f_gos.df <- read.table("results/go/f_nohla_all_gos_fdr.txt", header = TRUE, sep="\t")
f_bp <- f_gos.df[grep('GO:BP', f_gos.df[, "source"]), ]
#write.table(f_bp, file = "results/go/f_bp_nohla_all_gos_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
The STRING PPI network (version 11.0, protein.links.full.v11.0.txt.gz) was downloaded from STRING website. We extracted only human PPIs with a combined score ≥ 400 between proteins: zgrep ^"9606\." protein.links.full.v11.0.txt.gz | awk '($16 >= 400) { print $1, $2, $16 }' > human_PPIs.full.v11.0_400.txt
Made the file tab separated: tr ' ' '\t' < human_PPIs.full.v11.0_400.txt > tab_human_PPIs.full.v11.0_400.txt
And compressed it: gzip tab_human_PPIs.full.v11.0_400.txt
# loading human PPI network
PPI_400 <- read.table(gzfile("results/string/tab_human_PPIs.full.v11.0_400.txt.gz"), header = FALSE, sep="\t")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## number of items read is not a multiple of the number of columns
# removing all characters before and up to "."
PPI_400$V1 <- gsub(".*\\.", "", PPI_400$V1); PPI_400$V2 <- gsub(".*\\.", "", PPI_400$V2)
PPI_400 <- distinct(PPI_400) # 2,000,451 PPIs
#write.table(PPI_400, file = "results/string/PPI_400.txt", sep = "\t", col.names = FALSE, row.names=FALSE)
# extracting unique proteins
unique_proteins <- unique(c(PPI_400$V1, PPI_400$V2)) # 19,258
Overall, we extracted 2e+06 PPIs between a total of 19258 unique human proteins. Transcript identifiers (adult and fetal cortex RNA-seq data) were mapped to Ensembl gene identifiers.
# reading fetal and adult gene expression data (gene TPM counts)
fetal_gene_exp <- read.table("data/fetal_brain.gene_tpm_median.txt")
adult_gene_exp <- read.table("data/adult_brain.gene_tpm_median.txt")
# extractiong only genes that have no missing values and minimum expression level > 0 TPM
f_gene_exp <- subset(fetal_gene_exp, fetal_gene_exp$V3 > 0) # 33,710 expressed genes in fetal cortex
a_gene_exp <- subset(adult_gene_exp, adult_gene_exp$V3 > 0) # 29,560 expressed genes in adult cortex
colnames(f_gene_exp) <- c("gene_id", "gene_name", "tpm")
colnames(a_gene_exp) <- c("gene_id", "gene_name", "tpm")
# converting transcript IDs to gene IDs
f_gene_exp$gene_id <- gsub("\\..*","", f_gene_exp$gene_id)
a_gene_exp$gene_id <- gsub("\\..*","", a_gene_exp$gene_id)
# querying protein features
edb <- EnsDb.Hsapiens.v86
#hasProteinData(edb) # TRUE
#listTables(edb)
# getting protein information for gene IDs
f_protein_ids <- genes(edb, filter = ~ gene_id %in% f_gene_exp$gene_id,
columns = c("protein_id", "gene_name"))
f_df <- bind_rows(f_protein_ids@elementMetadata@listData)
#write.table(f_df, file = "results/string/fetal_protein_ids.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
a_protein_ids <- genes(edb, filter = ~ gene_id %in% a_gene_exp$gene_id,
columns = c("protein_id", "gene_name"))
a_df <- bind_rows(a_protein_ids@elementMetadata@listData)
#write.table(a_df, file = "results/string/adult_protein_ids.txt", sep = "\t", col.names = FALSE, row.names=FALSE)
# getting only protein and gene IDs and removing protein-gene pairs containing NAs
f_df <- f_df[, c("protein_id", "gene_name")]
a_df <- a_df[, c("protein_id", "gene_name")]
f_df <- f_df[complete.cases(f_df), ] # 87,065 proteins
#write.table(f_df, file = "results/string/fetal_protein_gene_pairs.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
f_df <- read.table("results/string/fetal_protein_gene_pairs.txt", header = TRUE, sep="\t")
a_df <- a_df[complete.cases(a_df), ] # 89,492 proteins
#write.table(a_df, file = "results/string/adult_protein_gene_pairs.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
a_df <- read.table("results/string/adult_protein_gene_pairs.txt", header = TRUE, sep="\t")
Transcriptome-wide fetal and adult cortex-specific PPIs (CSPPIs) were constructed by combining the STRING PPI network with cortex-specific expression data from adult or fetal cortex datasets.
#subsetting PPIs that are present in fetal and adult cortical tissues
f_PPI_400 <- subset(PPI_400, (PPI_400$V1 %in% f_df$protein_id & PPI_400$V2 %in% f_df$protein_id))
#write.table(f_PPI_400, file = "results/string/fetal_PPI_400.txt", sep = "\t", col.names = FALSE, row.names=FALSE) # 1,690,571 PPIs
f_unique_proteins <- unique(c(f_PPI_400$V1, f_PPI_400$V2)) # 16,519
a_PPI_400 <- subset(PPI_400, (PPI_400$V1 %in% a_df$protein_id & PPI_400$V2 %in% a_df$protein_id))
#write.table(a_PPI_400, file = "results/string/adult_PPI_400.txt", sep = "\t", col.names = FALSE, row.names=FALSE) # 1,784,342 PPIs
a_unique_proteins <- unique(c(a_PPI_400$V1, a_PPI_400$V2)) # 17,156
The resulting CSPPI networks contained 1784342 PPIs between 17156 unique proteins in the adult brain, and 1690571 PPIs between 16519 unique proteins in the fetal brain. Ensembl protein (STRING) identifiers were mapped to Ensembl gene identifiers.
f_dict <- f_df$gene_name # 87,065 proteins
names(f_dict) <- f_df$protein_id
a_dict <- a_df$gene_name # 89,492 proteins
names(a_dict) <- a_df$protein_id
# This function maps gene names to protein IDs. It takes two arguments: ppi is a dataframe containing IDs for interacting proteins (first two columns: protein1 and protein2) and combined score for their interaction (the third column)
create_csppi <- function(ppi, dict){
df <- data.frame()
for(i in 1:nrow(ppi)){
cat("Analysing PPI: ", i, "\n")
tryCatch({
p1 <- ppi[i,][1]; p2 <- ppi[i,][2]; comb <- ppi[i,][3]
t <- c(dict[p1$V1], dict[p2$V2], comb$V3)
df <- rbind(df, t)
}, error=function(e){
cat("ERROR: ", conditionMessage(e), "\n")
})
}
df <- distinct(df)
colnames(df) <- c("p1", "p2", "combined_score")
return(df)
}
# Testing
#test_subset <- subset(f_PPI_400, f_PPI_400$V3>997) # 21,381
#test_f_csppi.df <- create_csppi(test_subset, f_dict)
# Uncomment the line below
#f_csppi.df <- create_csppi(f_PPI_400, f_dict) # 1,690,571 PPIs
#write.table(f_csppi.df, file = "results/string/fetal_CSPPI.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
#f_csppi.df <- read.table(gzfile("results/string/fetal_CSPPI.txt.gz"), header = TRUE, sep = "\t")
# Uncomment the line below
#a_csppi.df <- create_csppi(a_PPI_400, a_dict) # 1,784,342 PPIs
#write.table(a_csppi.df, file = "results/string/adult_CSPPI.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
#a_csppi.df <- read.table(gzfile("results/string/adult_CSPPI.txt.gz"), header = TRUE, sep = "\t")
We extracted only interactions between ASD-associated genes from the fetal and adult CSPPIs to get ASD-specific fetal and adult CSPPIs.
#subsetting PPIs that are present in fetal and adult CSPPIs
#f_asd_csppi.df <- subset(f_csppi.df, (f_csppi.df$p1 %in% fetal_egenes & f_csppi.df$p2 %in% fetal_egenes))
#write.table(f_asd_csppi.df, file = "results/string/asd_f_CSPPIs_links.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
f_asd_csppi.df <- read.table("results/string/asd_f_CSPPIs_links.txt", header = TRUE, sep = "\t")
f_asd_unique_proteins <- unique(c(f_asd_csppi.df$p1, f_asd_csppi.df$p2)) # 36
#a_asd_csppi.df <- subset(a_csppi.df, (a_csppi.df$p1 %in% adult_egenes & a_csppi.df$p2 %in% adult_egenes))
#write.table(f_asd_csppi.df, file = "results/string/asd_a_CSPPIs_links.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
a_asd_csppi.df <- read.table("results/string/asd_a_CSPPIs_links.txt", header = TRUE, sep = "\t")
a_asd_unique_proteins <- unique(c(a_asd_csppi.df$p1, a_asd_csppi.df$p2)) # 9
The resulting ASD-specific CSPPI networks contained 10 PPIs between 9 unique proteins in the adult brain, and 42 PPIs between 36 unique proteins in the fetal brain.
The Louvain clustering algorithm was applied to identify ASD-specific clusters of functionally related proteins within the CSPPI networks.
# loading ASD-specific PPI links and nodes
asd_a_links <- read.table("results/string/asd_a_CSPPIs_links.txt", sep = "\t", header=TRUE)
asd_a_nodes <- read.table("results/string/asd_a_CSPPIs_nodes.txt", sep = "\t", header=TRUE)
asd_f_links <- read.table("results/string/asd_f_CSPPIs_links.txt", sep = "\t", header=TRUE)
asd_f_nodes <- read.table("results/string/asd_f_CSPPIs_nodes.txt", sep = "\t", header=TRUE)
## ASD CSPPIs in adult cortex
a_graph <- graph_from_data_frame(d=asd_a_links, vertices=asd_a_nodes, directed=F)
#E(a_graph) # the edges
#V(a_graph) # the vertices
#E(a_graph)$combined_score # edge attribute "combined_score"
#V(a_graph)$shared # vertex attribute "shared"
#V(a_graph)[shared=="yes"] # find edges by attribute
#a_graph[1,] # you can also examine the network matrix directly
#as_edgelist(a_graph, names=T) # get an edge list
#as_adjacency_matrix(a_graph, attr="combined_score") # get a matrix
#as_data_frame(a_graph, what="edges") # get a data frame describing edges
#as_data_frame(a_graph, what="vertices") # get a data frame describing nodes
# Louvain clustering
a_lc <- cluster_louvain(a_graph, weights=NA) #class(a_lc)
#membership(a_lc)
communities(a_lc) # 3 clusters
## $`1`
## [1] "APOPT1" "KLC1"
##
## $`2`
## [1] "HLA-DMA" "HLA-DQB1" "TAP1" "TAP2"
##
## $`3`
## [1] "AS3MT" "TRIM26" "ZSCAN31"
#plot(a_lc, a_graph, edge.color="orange", vertex.color="orange", vertex.frame.color="#ffffff", vertex.label.color="black")
col_nodes <- c("#92278F", "#939598")
col_names <- c("#1C75BC", "#ED1C24", "black")
set.seed(1234)
a <- layout_with_fr(a_graph)
#a <- layout_with_kk(aa_graph)
#pdf("figures/asd_a_CSPPI_network.pdf", width = 14, height = 10)
plot(a_graph, edge.width=2.5, edge.color="lightgray",
vertex.color=col_nodes[V(a_graph)$shared], vertex.frame.color="lightgray",
vertex.size=V(a_graph)$logTPM*7, vertex.label.color=col_names[V(a_graph)$regulation],
layout=a) # vertex.label=NA
#dev.off()
## ASD CSPPIs in fetal cortex
f_graph <- graph_from_data_frame(d=asd_f_links, vertices=asd_f_nodes, directed=F)
#E(f_graph) # the edges
#V(f_graph) # the vertices
#E(f_graph)$combined_score # edge attribute "combined_score"
#V(f_graph)$shared # vertex attribute "shared"
#V(f_graph)[shared=="yes"] # find edges by attribute
#f_graph[1,] # you can also examine the network matrix directly
#as_edgelist(f_graph, names=T) # get an edge list
#as_adjacency_matrix(f_graph, attr="combined_score") # get a matrix
#as_data_frame(f_graph, what="edges") # get a data frame describing edges
#as_data_frame(f_graph, what="vertices")
# Louvain clustering
f_lc <- cluster_louvain(f_graph, weights=NA)
#membership(f_lc)
communities(f_lc) # 7 clusters
## $`1`
## [1] "FADS1" "FADS2" "TBL1X"
##
## $`2`
## [1] "NDRG4" "RTN1"
##
## $`3`
## [1] "DHX38" "SF3B1" "THOC7"
##
## $`4`
## [1] "HCG27" "HLA-C" "HLA-DMA" "HLA-DMB" "HLA-DRB1" "HLA-F" "NDUFA6"
## [8] "TAP2"
##
## $`5`
## [1] "FTCDNL1" "TYW5"
##
## $`6`
## [1] "LY6G5C" "MPHOSPH9" "PGBD1" "TCF19" "VARS2" "VWA7" "ZKSCAN7"
## [8] "ZSCAN23"
##
## $`7`
## [1] "AS3MT" "BORCS7" "GNL3" "ITIH4" "MFSD13A" "NEK4" "NT5DC2"
## [8] "STAB1" "TMEM110" "WBP1L"
#plot(f_lc, f_graph, edge.color="orange", vertex.color="orange", vertex.frame.color="#ffffff", vertex.label.color="black")
col_nodes <- c("#009444", "#939598")
col_names <- c("#1C75BC", "#ED1C24", "black")
set.seed(1234)
f <- layout_with_fr(f_graph)
#pdf("figures/asd_f_CSPPI_network.pdf", width = 14, height = 10)
plot(f_graph, edge.width=2.5, edge.color="lightgray",
vertex.color=col_nodes[V(f_graph)$shared], vertex.frame.color="lightgray",
vertex.size=V(f_graph)$logTPM*7, vertex.label.color=col_names[V(f_graph)$regulation],
layout=f) # vertex.label=NA
#dev.off()
Pathway analysis was performed using the g:GOSt module of the g:Profiler tool. The significance level was determined using Benjamini-Hochberg algorithm (FDR < 0.05). Uncomment the lines below to query g:GOSt.
# This function quieries g:GOSt module of the g:Profiler tool. It takes a vector of genes and quieries the GOSt module. It outputs the dataframe with the query results for the genes.
query_kegg <- function(genes){
tryCatch({
t <- gost(query = genes, organism = "hsapiens", ordered_query = TRUE,
multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
measure_underrepresentation = FALSE, evcodes = TRUE,
user_threshold = 0.05, correction_method = "fdr",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = c("KEGG"), as_short_link = FALSE)
return(t)
}, error=function(e){
cat("ERROR: ", conditionMessage(e), "\n")
})
}
## Pathways analysis for 3 PPI clusters in adult cortex
## Uncomment the lines below to query g:GOSt
communities(a_lc)[1] # to get gene names
## $`1`
## [1] "APOPT1" "KLC1"
a_1 <- communities(a_lc)$'1' # to get gene names
#a_1_path <- query_kegg(a_1) # no pathways --> uknown cluster
#a_1_paths <- a_1_path$result
#a_1.df <- apply(a_1_paths, 2, as.character)
#write.table(a_1.df, file = "results/kegg/asd_a_CSPPI_cluster1_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
communities(a_lc)[2] # to get gene names
## $`2`
## [1] "HLA-DMA" "HLA-DQB1" "TAP1" "TAP2"
a_2 <- communities(a_lc)$'2' # to get gene names
#a_2_path <- query_kegg(a_2)
#a_2_paths <- a_2_path$result
#a_2.df <- apply(a_2_paths, 2, as.character)
#write.table(a_2.df, file = "results/paths/asd_a_CSPPI_cluster2_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
a_2.df <- read.table("results/kegg/asd_a_CSPPI_cluster2_kegg_fdr.txt", header = TRUE, sep = "\t")
communities(a_lc)[3] # to get gene names
## $`3`
## [1] "AS3MT" "TRIM26" "ZSCAN31"
a_3 <- communities(a_lc)$'3' # to get gene names
#a_3_path <- query_kegg(a_3) # no pathways --> uknown cluster
#a_3_paths <- a_3_path$result
#a_3.df <- apply(a_3_paths, 2, as.character)
#write.table(a_3.df, file = "results/kegg/asd_a_CSPPI_cluster3_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
## Pathways analysis for 3 PPI clusters in fetal cortex
communities(f_lc)[1] # to get gene names
## $`1`
## [1] "FADS1" "FADS2" "TBL1X"
f_1 <- communities(f_lc)$'1' # to get gene names
#f_1_path <- query_kegg(f_1)
#f_1_paths <- f_1_path$result
#f_1.df <- apply(f_1_paths, 2, as.character)
#write.table(f_1.df, file = "results/kegg/asd_f_CSPPI_cluster1_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
f_1.df <- read.table("results/kegg/asd_f_CSPPI_cluster1_kegg_fdr.txt", header = TRUE, sep = "\t")
communities(f_lc)[2] # to get gene names
## $`2`
## [1] "NDRG4" "RTN1"
f_2 <- communities(f_lc)$'2' # to get gene names
#f_2_path <- query_kegg(f_2) # no pathways --> uknown cluster
#f_2_paths <- f_2_path$result
#f_2.df <- apply(f_2_paths, 2, as.character)
#write.table(f_2.df, file = "results/kegg/asd_f_CSPPI_cluster2_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
communities(f_lc)[3] # to get gene names
## $`3`
## [1] "DHX38" "SF3B1" "THOC7"
f_3 <- communities(f_lc)$'3' # to get gene names
#f_3_path <- query_kegg(f_3)
#f_3_paths <- f_3_path$result
#f_3.df <- apply(f_3_paths, 2, as.character)
#write.table(f_3.df, file = "results/kegg/asd_f_CSPPI_cluster3_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
f_3.df <- read.table("results/kegg/asd_f_CSPPI_cluster3_kegg_fdr.txt", header = TRUE, sep = "\t")
communities(f_lc)[4] # to get gene names
## $`4`
## [1] "HCG27" "HLA-C" "HLA-DMA" "HLA-DMB" "HLA-DRB1" "HLA-F" "NDUFA6"
## [8] "TAP2"
f_4 <- communities(f_lc)$'4' # to get gene names
#f_4_path <- query_kegg(f_4)
#f_4_paths <- f_4_path$result
#f_4.df <- apply(f_4_paths, 2, as.character)
#write.table(f_4.df, file = "results/kegg/asd_f_CSPPI_cluster4_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
f_4.df <- read.table("results/kegg/asd_f_CSPPI_cluster4_kegg_fdr.txt", header = TRUE, sep = "\t")
communities(f_lc)[5] # to get gene names
## $`5`
## [1] "FTCDNL1" "TYW5"
f_5 <- communities(f_lc)$'5' # to get gene names
#f_5_path <- query_kegg(f_5) # no pathways --> uknown cluster
#f_5_paths <- f_5_path$result
#f_5.df <- apply(f_5_paths, 2, as.character)
#write.table(f_5.df, file = "results/kegg/asd_f_CSPPI_cluster5_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
communities(f_lc)[6] # to get gene names
## $`6`
## [1] "LY6G5C" "MPHOSPH9" "PGBD1" "TCF19" "VARS2" "VWA7" "ZKSCAN7"
## [8] "ZSCAN23"
f_6 <- communities(f_lc)$'6' # to get gene names
#f_6_path <- query_kegg(f_6)
#f_6_paths <- f_6_path$result
#f_6.df <- apply(f_6_paths, 2, as.character)
#write.table(f_6.df, file = "results/kegg/asd_f_CSPPI_cluster6_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
f_6.df <- read.table("results/kegg/asd_f_CSPPI_cluster6_kegg_fdr.txt", header = TRUE, sep = "\t")
communities(f_lc)[7] # to get gene names
## $`7`
## [1] "AS3MT" "BORCS7" "GNL3" "ITIH4" "MFSD13A" "NEK4" "NT5DC2"
## [8] "STAB1" "TMEM110" "WBP1L"
f_7 <- communities(f_lc)$'7' # to get gene names
#f_7_path <- query_kegg(f_7)
#f_7_paths <- f_7_path$result
#f_7.df <- apply(f_7_paths, 2, as.character)
#write.table(f_7.df, file = "results/kegg/asd_f_CSPPI_cluster7_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
f_7.df <- read.table("results/kegg/asd_f_CSPPI_cluster7_kegg_fdr.txt", header = TRUE, sep = "\t")